In [1]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
#loading the dataset
df=pd.read_csv('Titanic.csv')
In [3]:
#displaying the first five columns of the data
df.head()
Out[3]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | False | 0 |
| 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | 1 |
| 2 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | True | 1 |
| 3 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | 1 |
| 4 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | 0 |
In [4]:
#displaying the last five columns of the data
df.tail()
Out[4]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 886 | male | 27.0 | 0 | 0 | 13.00 | S | Second | man | True | 0 |
| 887 | female | 19.0 | 0 | 0 | 30.00 | S | First | woman | True | 1 |
| 888 | female | NaN | 1 | 2 | 23.45 | S | Third | woman | False | 0 |
| 889 | male | 26.0 | 0 | 0 | 30.00 | C | First | man | True | 1 |
| 890 | male | 32.0 | 0 | 0 | 7.75 | Q | Third | man | True | 0 |
In [5]:
#getting the no of rows in our data
len(df)
Out[5]:
891
In [6]:
#getting the columns in our data
df.columns
Out[6]:
Index(['sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who',
'alone', 'survived'],
dtype='object')
diplaying the number of values in each sample class
In [7]:
df['age'].value_counts()
Out[7]:
age
24.00 30
22.00 27
18.00 26
19.00 25
28.00 25
..
36.50 1
55.50 1
0.92 1
23.50 1
74.00 1
Name: count, Length: 88, dtype: int64
In [8]:
df['fare'].value_counts()
Out[8]:
fare
8.0500 43
13.0000 42
7.8958 38
7.7500 34
26.0000 31
..
35.0000 1
28.5000 1
6.2375 1
14.0000 1
10.5167 1
Name: count, Length: 248, dtype: int64
In [9]:
df['who'].value_counts()
Out[9]:
who man 537 woman 271 child 83 Name: count, dtype: int64
In [10]:
df['alone'].value_counts()
Out[10]:
alone True 537 False 354 Name: count, dtype: int64
In [11]:
df['class'].value_counts()
Out[11]:
class Third 491 First 216 Second 184 Name: count, dtype: int64
In [12]:
df['alone'].value_counts()
Out[12]:
alone True 537 False 354 Name: count, dtype: int64
In [13]:
df.describe()
Out[13]:
| age | sibsp | parch | fare | survived | |
|---|---|---|---|---|---|
| count | 714.000000 | 891.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 29.699118 | 0.523008 | 0.381594 | 32.204208 | 0.383838 |
| std | 14.526497 | 1.102743 | 0.806057 | 49.693429 | 0.486592 |
| min | 0.420000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 20.125000 | 0.000000 | 0.000000 | 7.910400 | 0.000000 |
| 50% | 28.000000 | 0.000000 | 0.000000 | 14.454200 | 0.000000 |
| 75% | 38.000000 | 1.000000 | 0.000000 | 31.000000 | 1.000000 |
| max | 80.000000 | 8.000000 | 6.000000 | 512.329200 | 1.000000 |
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sex 891 non-null object 1 age 714 non-null float64 2 sibsp 891 non-null int64 3 parch 891 non-null int64 4 fare 891 non-null float64 5 embarked 889 non-null object 6 class 891 non-null object 7 who 891 non-null object 8 alone 891 non-null bool 9 survived 891 non-null int64 dtypes: bool(1), float64(2), int64(3), object(4) memory usage: 63.6+ KB
In [15]:
df.isnull().sum()
Out[15]:
sex 0 age 177 sibsp 0 parch 0 fare 0 embarked 2 class 0 who 0 alone 0 survived 0 dtype: int64
VISUALIZATION 1.HISTOGRAMS
In [16]:
df['age'].hist()
Out[16]:
<Axes: >
In [17]:
df['who'].hist()
Out[17]:
<Axes: >
In [18]:
df['fare'].hist()
Out[18]:
<Axes: >
In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
In [20]:
df.shape
Out[20]:
(891, 10)
In [21]:
df['sex']=le.fit_transform(df['sex'])
df.head()
Out[21]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22.0 | 1 | 0 | 7.2500 | S | Third | man | False | 0 |
| 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | 1 |
| 2 | 0 | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | True | 1 |
| 3 | 0 | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | 1 |
| 4 | 1 | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | 0 |
In [22]:
df['who']=le.fit_transform(df['who'])
df.head()
df.tail()
Out[22]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 886 | 1 | 27.0 | 0 | 0 | 13.00 | S | Second | 1 | True | 0 |
| 887 | 0 | 19.0 | 0 | 0 | 30.00 | S | First | 2 | True | 1 |
| 888 | 0 | NaN | 1 | 2 | 23.45 | S | Third | 2 | False | 0 |
| 889 | 1 | 26.0 | 0 | 0 | 30.00 | C | First | 1 | True | 1 |
| 890 | 1 | 32.0 | 0 | 0 | 7.75 | Q | Third | 1 | True | 0 |
In [23]:
df['alone']=le.fit_transform(df['alone'])
df.head()
Out[23]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22.0 | 1 | 0 | 7.2500 | S | Third | 1 | 0 | 0 |
| 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C | First | 2 | 0 | 1 |
| 2 | 0 | 26.0 | 0 | 0 | 7.9250 | S | Third | 2 | 1 | 1 |
| 3 | 0 | 35.0 | 1 | 0 | 53.1000 | S | First | 2 | 0 | 1 |
| 4 | 1 | 35.0 | 0 | 0 | 8.0500 | S | Third | 1 | 1 | 0 |
In [24]:
df['class']=le.fit_transform(df['class'])
df.head()
Out[24]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22.0 | 1 | 0 | 7.2500 | S | 2 | 1 | 0 | 0 |
| 1 | 0 | 38.0 | 1 | 0 | 71.2833 | C | 0 | 2 | 0 | 1 |
| 2 | 0 | 26.0 | 0 | 0 | 7.9250 | S | 2 | 2 | 1 | 1 |
| 3 | 0 | 35.0 | 1 | 0 | 53.1000 | S | 0 | 2 | 0 | 1 |
| 4 | 1 | 35.0 | 0 | 0 | 8.0500 | S | 2 | 1 | 1 | 0 |
In [25]:
df['embarked']=le.fit_transform(df['embarked'])
df.head()
Out[25]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22.0 | 1 | 0 | 7.2500 | 2 | 2 | 1 | 0 | 0 |
| 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 0 | 0 | 2 | 0 | 1 |
| 2 | 0 | 26.0 | 0 | 0 | 7.9250 | 2 | 2 | 2 | 1 | 1 |
| 3 | 0 | 35.0 | 1 | 0 | 53.1000 | 2 | 0 | 2 | 0 | 1 |
| 4 | 1 | 35.0 | 0 | 0 | 8.0500 | 2 | 2 | 1 | 1 | 0 |
In [26]:
df.corr()
Out[26]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| sex | 1.000000 | 0.093254 | -0.114631 | -0.245489 | -0.182333 | 0.104057 | 0.131900 | -0.639773 | 0.303646 | -0.543351 |
| age | 0.093254 | 1.000000 | -0.308247 | -0.189119 | 0.096067 | -0.025252 | -0.369226 | 0.378685 | 0.198270 | -0.077221 |
| sibsp | -0.114631 | -0.308247 | 1.000000 | 0.414838 | 0.159651 | 0.066654 | 0.083081 | -0.136003 | -0.584471 | -0.035322 |
| parch | -0.245489 | -0.189119 | 0.414838 | 1.000000 | 0.216225 | 0.038322 | 0.018443 | -0.055682 | -0.583398 | 0.081629 |
| fare | -0.182333 | 0.096067 | 0.159651 | 0.216225 | 1.000000 | -0.221226 | -0.549500 | 0.146290 | -0.271832 | 0.257307 |
| embarked | 0.104057 | -0.025252 | 0.066654 | 0.038322 | -0.221226 | 1.000000 | 0.157112 | -0.060177 | 0.065610 | -0.163517 |
| class | 0.131900 | -0.369226 | 0.083081 | 0.018443 | -0.549500 | 0.157112 | 1.000000 | -0.196793 | 0.135207 | -0.338481 |
| who | -0.639773 | 0.378685 | -0.136003 | -0.055682 | 0.146290 | -0.060177 | -0.196793 | 1.000000 | 0.006540 | 0.325753 |
| alone | 0.303646 | 0.198270 | -0.584471 | -0.583398 | -0.271832 | 0.065610 | 0.135207 | 0.006540 | 1.000000 | -0.203367 |
| survived | -0.543351 | -0.077221 | -0.035322 | 0.081629 | 0.257307 | -0.163517 | -0.338481 | 0.325753 | -0.203367 | 1.000000 |
In [27]:
corr=df.corr()
fig, ax= plt.subplots(figsize=(10,10))
sns.heatmap(corr, annot =True, ax=ax,cmap='coolwarm')
Out[27]:
<Axes: >
In [28]:
#scatterplots
sns.pairplot(df,hue='survived')
plt.show()
In [29]:
sns.boxplot(y='age', data=df,width=0.2)
Out[29]:
<Axes: ylabel='age'>
In [30]:
sns.boxplot(y='age',x='sex',hue='survived', data=df,width=0.2)
Out[30]:
<Axes: xlabel='sex', ylabel='age'>
In [31]:
df.head(10)
Out[31]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | survived | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 22.0 | 1 | 0 | 7.2500 | 2 | 2 | 1 | 0 | 0 |
| 1 | 0 | 38.0 | 1 | 0 | 71.2833 | 0 | 0 | 2 | 0 | 1 |
| 2 | 0 | 26.0 | 0 | 0 | 7.9250 | 2 | 2 | 2 | 1 | 1 |
| 3 | 0 | 35.0 | 1 | 0 | 53.1000 | 2 | 0 | 2 | 0 | 1 |
| 4 | 1 | 35.0 | 0 | 0 | 8.0500 | 2 | 2 | 1 | 1 | 0 |
| 5 | 1 | NaN | 0 | 0 | 8.4583 | 1 | 2 | 1 | 1 | 0 |
| 6 | 1 | 54.0 | 0 | 0 | 51.8625 | 2 | 0 | 1 | 1 | 0 |
| 7 | 1 | 2.0 | 3 | 1 | 21.0750 | 2 | 2 | 0 | 0 | 0 |
| 8 | 0 | 27.0 | 0 | 2 | 11.1333 | 2 | 2 | 2 | 0 | 1 |
| 9 | 0 | 14.0 | 1 | 0 | 30.0708 | 0 | 1 | 0 | 0 | 1 |
In [33]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[33], line 3 1 from sklearn.impute import SimpleImputer 2 imputer = SimpleImputer(strategy='mean') ----> 3 x_train = imputer.fit_transform(x_train) 4 x_test = imputer.transform(x_test) NameError: name 'x_train' is not defined
MODEL TRAINING
In [34]:
from sklearn.model_selection import train_test_split
X=df.drop(columns=['survived'])
Y=df['survived']
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
In [35]:
print(y_train.unique()) # If using pandas Series
[0 1]
In [36]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)
In [37]:
#LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
C:\Users\Win\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
Out[37]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
Print metric to get performance
In [38]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 79.88826815642457
In [39]:
#K-NEAREST NEIGHBORS
from sklearn.neighbors import KNeighborsClassifier
model=KNeighborsClassifier()
model.fit(x_train,y_train)
Out[39]:
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [40]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 71.50837988826815
In [41]:
#DECISION TREE
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier()
model.fit(x_train,y_train)
Out[41]:
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [42]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 79.3296089385475
In [43]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(x_train,y_train)
Out[43]:
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [44]:
print("Accuracy:",model.score(x_test,y_test)*100)
Accuracy: 82.12290502793296
In [45]:
#use a trained model to make predict
print(x_test[:5]) # Show first 5 rows of the NumPy array
[[ 1. 29.49884615 1. 1. 15.2458 0. 2. 1. 0. ] [ 1. 31. 0. 0. 10.5 2. 1. 1. 1. ] [ 1. 20. 0. 0. 7.925 2. 2. 1. 1. ] [ 0. 6. 0. 1. 33. 2. 1. 0. 0. ] [ 0. 14. 1. 0. 11.2417 0. 2. 0. 0. ]]
In [46]:
import pandas as pd
# You need to know the column names — assuming X.columns from the original DataFrame
x_test_df = pd.DataFrame(x_test, columns=X.columns)
x_test_df.head()
Out[46]:
| sex | age | sibsp | parch | fare | embarked | class | who | alone | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 29.498846 | 1.0 | 1.0 | 15.2458 | 0.0 | 2.0 | 1.0 | 0.0 |
| 1 | 1.0 | 31.000000 | 0.0 | 0.0 | 10.5000 | 2.0 | 1.0 | 1.0 | 1.0 |
| 2 | 1.0 | 20.000000 | 0.0 | 0.0 | 7.9250 | 2.0 | 2.0 | 1.0 | 1.0 |
| 3 | 0.0 | 6.000000 | 0.0 | 1.0 | 33.0000 | 2.0 | 1.0 | 0.0 | 0.0 |
| 4 | 0.0 | 14.000000 | 1.0 | 0.0 | 11.2417 | 0.0 | 2.0 | 0.0 | 0.0 |
In [47]:
y_test.head()
Out[47]:
709 1 439 0 840 0 720 1 39 1 Name: survived, dtype: int64
In [48]:
clf= RandomForestClassifier()
clf.fit(x_train,y_train)
(clf.score(x_test,y_test)*100)
Out[48]:
82.12290502793296
In [49]:
clf.predict(x_test)
Out[49]:
array([0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0,
0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
0, 1, 1], dtype=int64)
In [50]:
np.array(y_test)
Out[50]:
array([1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,
1, 1, 1], dtype=int64)
In [51]:
#compare the prediction to the truth labels to evaluate the model
y_preds=clf.predict(x_test)
(np.mean(y_preds == y_test)*100)
Out[51]:
82.12290502793296
In [52]:
pip install plotly
Requirement already satisfied: plotly in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (6.1.2) Requirement already satisfied: narwhals>=1.15.1 in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (from plotly) (1.43.0) Requirement already satisfied: packaging in c:\users\win\appdata\local\programs\python\python312\lib\site-packages (from plotly) (23.2) Note: you may need to restart the kernel to use updated packages.
[notice] A new release of pip is available: 24.0 -> 25.1.1 [notice] To update, run: python.exe -m pip install --upgrade pip
In [53]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook' # or 'iframe', 'colab', or 'browser' if needed
fig = px.bar(df, x='age', y='sex', color='survived')
fig.show()
In [61]:
df.plot()
plt.title("a general plot for the dataframe")
plt.show()
In [60]:
df.plot(kind='scatter',x='age',y='class')
plt.title("class vs age")
plt.xlabel("age")
plt.ylabel("class")
plt.show()
In [ ]: